Load Data
data.loaded <-
readr::read_delim(
file = "data/publications.txt",
delim = "\t",
escape_double = FALSE,
col_names = FALSE,
trim_ws = TRUE
)
## Rows: 97 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: "\t"
## chr (1): X1
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
Convert Titles to Lower Case
data$"Title.lower" <-
tolower( data$"Title" )
# Neutralize plurals and verbs.
# data$"Title.lower" <-
# stringr::str_replace_all(
# string = data$"Title.lower",
# pattern = "(s\\s)",
# replacement = " "
# )
#
# data$"Title.lower" <-
# stringr::str_replace_all(
# string = data$"Title.lower",
# pattern = "(s\\:)",
# replacement = ""
# )
#
# data$"Title.lower" <-
# stringr::str_replace_all(
# string = data$"Title.lower",
# pattern = "(s$)",
# replacement = ""
# )
data$"Title.lower" <-
stringr::str_replace_all(
string = data$"Title.lower",
# pattern = "type 1 diabete",
pattern = "type 1 diabetes",
replacement = "type_1_diabetes"
)
data$"Title.lower" <-
stringr::str_replace_all(
string = data$"Title.lower",
# pattern = "type 2 diabete",
pattern = "type 2 diabetes",
replacement = "type_2_diabetes"
)
Omit Punctuation from Titles
words <-
stringr::str_split(
string = data[ , "Title.lower" ],
pattern = "\\s"
)
words <-
lapply(
X = words,
FUN = tolower
)
words <-
lapply(
X = words,
FUN = stringr::str_replace_all,
pattern = "(\\()|(\\))|(\\:)|(\\,)|(\\.)",
replacement = ""
)
table.words <-
sort(
x = table( unlist( words ) ),
decreasing = TRUE
)
Define List of Blocked Words
blocklist <-
c(
"in",
"of",
"and",
"the",
"with",
"for",
"from",
"to",
"...",
"an",
"at",
"on",
letters,
"after",
"are",
"as",
"based",
"by",
"do",
"is",
"not",
"without",
"through",
"during"
)
# words.unique <- words.unique[ !( words.unique %in% blocklist ) ]
blocklist <-
c(
blocklist,
words.unique[ grepl( x = words.unique, pattern = "^[0-9]+$" ) ]
)
words <-
lapply(
X = words,
FUN = function( x ) {
x[ !( x %in% blocklist ) ]
}
)
Create the Words-by-Articles Matrix
wba <-
array(
data = 0,
dim = c( length( words.unique ), nrow( data ) )
)
rownames( wba ) <- words.unique
colnames( wba ) <- data[ , "Title" ]
for ( i in 1:nrow( data ) ) {
wba[ words[[ i ]], i ] <- 1
}
Compute PCA
wba.norm <- scale( wba.log1px )
# wba.norm <- scale( wba )
result.pca <-
princomp(
x = wba.norm
)
Plot PCA
library( "ggfortify" )
## Warning: package 'ggfortify' was built under R version 4.0.5
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
plot <-
autoplot(
result.pca
,
# loadings = TRUE,
# loadings.label = TRUE
)
print( plot )

Cluster Words
result.clustering <-
cluster::pam(
x = wba.norm,
k = 8
)
# result.clustering <-
# cluster::pam(
# x = result.pca$"scores"[ , 1:2 ],
# k = 8
# )
Prepare Data for Plot
jitter <- 1
data.plot <-
data.frame(
Word = rownames( result.pca$"scores" ),
result.pca$"scores"
)
# data.plot$"Comp.1.jittered" <-
# data.plot$"Comp.1" +
# rnorm(
# n = nrow( data.plot ),
# sd = jitter
# )
#
# data.plot$"Comp.2.jittered" <-
# data.plot$"Comp.2" +
# rnorm(
# n = nrow( data.plot ),
# sd = jitter
# )
data.plot$"Distance" <-
sqrt( data.plot$"Comp.1"^2 + data.plot$"Comp.2"^2 ) + 1
data.plot$"Term" <-
stringr::str_replace_all(
string = data.plot$"Word",
pattern = "\\-",
replacement = "-\n"
)
data.plot$"Term" <-
stringr::str_replace_all(
string = data.plot$"Word",
pattern = "\\_",
replacement = "\n"
)
data.plot$"Cluster" <-
as.factor( result.clustering$"clustering" )
# as.factor(
# paste(
# "C",
# result.clustering$"clustering"
# )
# )
data.plot$"Publications" <-
# tmp <-
apply(
X = wba[ data.plot$"Word", ] == 1,
MAR = 1,
FUN = function( x ) {
paste(
"\n",
names(
which( x )
),
collapse = "\n"
)
# names( which( x ) )
}
)
View the Exported Widget
SessionInfo
utils::sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19042)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] tidyr_1.1.2 ggfortify_0.4.12 ggplot2_3.3.5
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.1.1 xfun_0.27 purrr_0.3.4 colorspace_2.0-0
## [5] vctrs_0.3.8 generics_0.1.0 htmltools_0.5.1.1 viridisLite_0.4.0
## [9] yaml_2.2.1 utf8_1.1.4 plotly_4.9.4.1 rlang_0.4.11
## [13] jquerylib_0.1.4 pillar_1.6.2 glue_1.4.2 withr_2.4.2
## [17] bit64_4.0.5 RColorBrewer_1.1-2 lifecycle_1.0.0 stringr_1.4.0
## [21] munsell_0.5.0 gtable_0.3.0 htmlwidgets_1.5.4 evaluate_0.14
## [25] labeling_0.4.2 knitr_1.34 tzdb_0.1.2 crosstalk_1.1.1
## [29] curl_4.3 parallel_4.0.4 fansi_0.4.2 highr_0.9
## [33] readr_2.0.1 scales_1.1.1 vroom_1.5.5 jsonlite_1.7.2
## [37] farver_2.1.0 bit_4.0.4 gridExtra_2.3 hms_1.1.0
## [41] digest_0.6.27 stringi_1.5.3 dplyr_1.0.4 grid_4.0.4
## [45] cli_3.0.1 tools_4.0.4 magrittr_2.0.1 lazyeval_0.2.2
## [49] tibble_3.0.6 cluster_2.1.2 crayon_1.4.1 pkgconfig_2.0.3
## [53] ellipsis_0.3.2 data.table_1.13.6 rmarkdown_2.11 httr_1.4.2
## [57] rstudioapi_0.13 R6_2.5.1 compiler_4.0.4